{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 15: Machine learning"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Robert Johansson\n",
    "\n",
    "Source code listings for [Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib](https://www.apress.com/us/book/9781484242452) (ISBN 978-1-484242-45-2)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "from sklearn import datasets\n",
    "from sklearn import model_selection\n",
    "from sklearn import linear_model\n",
    "from sklearn import metrics\n",
    "from sklearn import tree\n",
    "from sklearn import neighbors\n",
    "\n",
    "from sklearn import svm\n",
    "from sklearn import ensemble\n",
    "from sklearn import cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib as mpl\n",
    "mpl.rcParams['mathtext.fontset'] = 'stix'\n",
    "mpl.rcParams['font.family'] = 'serif'\n",
    "mpl.rcParams['font.sans-serif'] = 'stix'\n",
    "\n",
    "sns.set(style=\"whitegrid\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set(style=\"darkgrid\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Built in datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "datasets.load_wine #()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "datasets.fetch_california_housing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "datasets.make_regression"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "np.random.seed(123)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "X_all, y_all = datasets.make_regression(n_samples=50, n_features=50, n_informative=10) #, noise=2.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = model_selection.train_test_split(X_all, y_all, train_size=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "X_train.shape, y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "X_test.shape, y_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model = linear_model.LinearRegression()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def sse(resid):\n",
    "    return sum(resid**2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "resid_train = y_train - model.predict(X_train)\n",
    "sse_train = sse(resid_train)\n",
    "sse_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "resid_test = y_test - model.predict(X_test)\n",
    "sse_test = sse(resid_train)\n",
    "sse_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.score(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def plot_residuals_and_coeff(resid_train, resid_test, coeff):\n",
    "    fig, axes = plt.subplots(1, 3, figsize=(12, 3))\n",
    "    axes[0].bar(np.arange(len(resid_train)), resid_train)\n",
    "    axes[0].set_xlabel(\"sample number\")\n",
    "    axes[0].set_ylabel(\"residual\")\n",
    "    axes[0].set_title(\"training data\")\n",
    "    axes[1].bar(np.arange(len(resid_test)), resid_test)\n",
    "    axes[1].set_xlabel(\"sample number\")\n",
    "    axes[1].set_ylabel(\"residual\")\n",
    "    axes[1].set_title(\"testing data\")\n",
    "    axes[2].bar(np.arange(len(coeff)), coeff)\n",
    "    axes[2].set_xlabel(\"coefficient number\")\n",
    "    axes[2].set_ylabel(\"coefficient\")\n",
    "    fig.tight_layout()\n",
    "    return fig, axes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)\n",
    "fig.savefig(\"ch15-regression-ols.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model = linear_model.Ridge() #alpha=2.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "resid_train = y_train - model.predict(X_train)\n",
    "sse_train = sum(resid_train**2)\n",
    "sse_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "resid_test = y_test - model.predict(X_test)\n",
    "sse_test = sum(resid_test**2)\n",
    "sse_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.score(X_train, y_train), model.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)\n",
    "fig.savefig(\"ch15-regression-ridge.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model = linear_model.Lasso(alpha=1.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "resid_train = y_train - model.predict(X_train)\n",
    "sse_train = sse(resid_train)\n",
    "sse_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "resid_test = y_test - model.predict(X_test)\n",
    "sse_test = sse(resid_test)\n",
    "sse_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)\n",
    "fig.savefig(\"ch15-regression-lasso.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "alphas = np.logspace(-4, 2, 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "coeffs = np.zeros((len(alphas), X_train.shape[1]))\n",
    "sse_train = np.zeros_like(alphas)\n",
    "sse_test = np.zeros_like(alphas)\n",
    "\n",
    "for n, alpha in enumerate(alphas):\n",
    "    model = linear_model.Lasso(alpha=alpha)\n",
    "    model.fit(X_train, y_train)\n",
    "    coeffs[n, :] = model.coef_\n",
    "    resid = y_train - model.predict(X_train)\n",
    "    sse_train[n] = sum(resid**2)\n",
    "    resid = y_test - model.predict(X_test)\n",
    "    sse_test[n] = sum(resid**2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharex=True)\n",
    "\n",
    "for n in range(coeffs.shape[1]):\n",
    "    axes[0].plot(np.log10(alphas), coeffs[:, n], color='k', lw=0.5)\n",
    "\n",
    "axes[1].semilogy(np.log10(alphas), sse_train, label=\"train\")\n",
    "axes[1].semilogy(np.log10(alphas), sse_test, label=\"test\")\n",
    "axes[1].legend(loc=0)\n",
    "\n",
    "axes[0].set_xlabel(r\"${\\log_{10}}\\alpha$\", fontsize=18)\n",
    "axes[0].set_ylabel(r\"coefficients\", fontsize=18)\n",
    "axes[1].set_xlabel(r\"${\\log_{10}}\\alpha$\", fontsize=18)\n",
    "axes[1].set_ylabel(r\"sse\", fontsize=18)\n",
    "fig.tight_layout()\n",
    "fig.savefig(\"ch15-regression-lasso-vs-alpha.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model = linear_model.LassoCV()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.fit(X_all, y_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.alpha_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "resid_train = y_train - model.predict(X_train)\n",
    "sse_train = sse(resid_train)\n",
    "sse_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "resid_test = y_test - model.predict(X_test)\n",
    "sse_test = sse(resid_test)\n",
    "sse_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.score(X_train, y_train), model.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)\n",
    "fig.savefig(\"ch15-regression-lasso-cv.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "model = linear_model.ElasticNetCV()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.fit(X_all, y_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.alpha_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.l1_ratio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "resid_train = y_train - model.predict(X_train)\n",
    "sse_train = sum(resid_train**2)\n",
    "sse_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "resid_test = y_test - model.predict(X_test)\n",
    "sse_test = sum(resid_test**2)\n",
    "sse_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "model.score(X_train, y_train), model.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "fig, ax = plot_residuals_and_coeff(resid_train, resid_test, model.coef_)\n",
    "fig.savefig(\"ch15-regression-elastic-net-cv.pdf\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "iris = datasets.load_iris()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "type(iris)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "iris.target_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "iris.feature_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "iris.data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "iris.target.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# print(iris['DESCR'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = model_selection.train_test_split(iris.data, iris.target, train_size=0.7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "classifier = linear_model.LogisticRegression()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "classifier.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "y_test_pred = classifier.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "print(metrics.classification_report(y_test, y_test_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "np.bincount(y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "metrics.confusion_matrix(y_test, y_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "classifier = tree.DecisionTreeClassifier()\n",
    "classifier.fit(X_train, y_train)\n",
    "y_test_pred = classifier.predict(X_test)\n",
    "metrics.confusion_matrix(y_test, y_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "classifier = neighbors.KNeighborsClassifier()\n",
    "classifier.fit(X_train, y_train)\n",
    "y_test_pred = classifier.predict(X_test)\n",
    "metrics.confusion_matrix(y_test, y_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "classifier = svm.SVC()\n",
    "classifier.fit(X_train, y_train)\n",
    "y_test_pred = classifier.predict(X_test)\n",
    "metrics.confusion_matrix(y_test, y_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "classifier = ensemble.RandomForestClassifier()\n",
    "classifier.fit(X_train, y_train)\n",
    "y_test_pred = classifier.predict(X_test)\n",
    "metrics.confusion_matrix(y_test, y_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "train_size_vec = np.linspace(0.1, 0.9, 30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "classifiers = [tree.DecisionTreeClassifier,\n",
    "               neighbors.KNeighborsClassifier,\n",
    "               svm.SVC,\n",
    "               ensemble.RandomForestClassifier\n",
    "              ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "cm_diags = np.zeros((3, len(train_size_vec), len(classifiers)), dtype=float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "for n, train_size in enumerate(train_size_vec):\n",
    "    X_train, X_test, y_train, y_test = \\\n",
    "        model_selection.train_test_split(iris.data, iris.target, train_size=train_size)\n",
    "\n",
    "    for m, Classifier in enumerate(classifiers): \n",
    "        classifier = Classifier()\n",
    "        classifier.fit(X_train, y_train)\n",
    "        y_test_pred = classifier.predict(X_test)\n",
    "        cm_diags[:, n, m] = metrics.confusion_matrix(y_test, y_test_pred).diagonal()\n",
    "        cm_diags[:, n, m] /= np.bincount(y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(1, len(classifiers), figsize=(12, 3))\n",
    "\n",
    "for m, Classifier in enumerate(classifiers): \n",
    "    axes[m].plot(train_size_vec, cm_diags[2, :, m], label=iris.target_names[2])\n",
    "    axes[m].plot(train_size_vec, cm_diags[1, :, m], label=iris.target_names[1])\n",
    "    axes[m].plot(train_size_vec, cm_diags[0, :, m], label=iris.target_names[0])\n",
    "    axes[m].set_title(type(Classifier()).__name__)\n",
    "    axes[m].set_ylim(0, 1.1)\n",
    "    axes[m].set_xlim(0.1, 0.9)\n",
    "    axes[m].set_ylabel(\"classification accuracy\")\n",
    "    axes[m].set_xlabel(\"training size ratio\")\n",
    "    axes[m].legend(loc=4)\n",
    "\n",
    "fig.tight_layout()\n",
    "fig.savefig(\"ch15-classification-comparison.pdf\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = iris.data, iris.target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "np.random.seed(123)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "n_clusters = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "c = cluster.KMeans(n_clusters=n_clusters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "c.fit(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "y_pred = c.predict(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "y_pred[::8]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "y[::8]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "idx_0, idx_1, idx_2 = (np.where(y_pred == n) for n in range(3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "y_pred[idx_0], y_pred[idx_1], y_pred[idx_2] = 2, 0, 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "y_pred[::8]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "metrics.confusion_matrix(y, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "N = X.shape[1]\n",
    "\n",
    "fig, axes = plt.subplots(N, N, figsize=(12, 12), sharex=True, sharey=True)\n",
    "\n",
    "colors = [\"coral\", \"blue\", \"green\"]\n",
    "markers = [\"^\", \"v\", \"o\"]\n",
    "for m in range(N):\n",
    "    for n in range(N):\n",
    "        for p in range(n_clusters):\n",
    "            mask = y_pred == p\n",
    "            axes[m, n].scatter(X[:, m][mask], X[:, n][mask],\n",
    "                               marker=markers[p], s=30, \n",
    "                               color=colors[p], alpha=0.25)\n",
    "\n",
    "        for idx in np.where(y != y_pred):\n",
    "            axes[m, n].scatter(X[idx, m], X[idx, n],\n",
    "                               marker=\"s\", s=30, \n",
    "                               edgecolor=\"red\", \n",
    "                               facecolor=(1,1,1,0))\n",
    "            \n",
    "            \n",
    "    axes[N-1, m].set_xlabel(iris.feature_names[m], fontsize=16)\n",
    "    axes[m, 0].set_ylabel(iris.feature_names[m], fontsize=16)\n",
    "fig.tight_layout()\n",
    "fig.savefig(\"ch15-clustering.pdf\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Versions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%reload_ext version_information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "%version_information sklearn, numpy, matplotlib, seaborn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "npbook_py310",
   "language": "python",
   "name": "npbook_py310"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
